Client Report - To Infinity and Beyond…wait wrong movie

Unit 5 Stretch

Author

Ezekial Curran

Show the code
import polars as pl
import numpy as np
from lets_plot import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
  classification_report, 
  accuracy_score, 
  recall_score, 
  precision_score, 
  f1_score
  )
# add the additional libraries you need to import for ML here

# Initialise lets-plot for HTML rendering of the charts built later in this
# report; isolated_frame=True is passed straight through to the library.
LetsPlot.setup_html(isolated_frame=True)
Show the code
# Load the Star Wars survey. `url` records the upstream location of the
# data; the frame itself is read from a local copy named "StarWars.csv".
url = "https://github.com/fivethirtyeight/data/raw/master/star-wars-survey/StarWars.csv"
df = pl.read_csv("StarWars.csv")

# Build the old-name -> new-name mapping one column group at a time.
episodes = ['i', 'ii', 'iii', 'iv', 'v', 'vi']
raw_cols = df.columns

new_names = {raw_cols[1]: "seen", raw_cols[2]: "fan"}

# Columns 3-8 and 9-14 hold one "seen" and one "rank" column per episode.
for offset, episode in enumerate(episodes):
    new_names[raw_cols[3 + offset]] = f"seen_epi_{episode}"
for offset, episode in enumerate(episodes):
    new_names[raw_cols[9 + offset]] = f"rank_epi_{episode}"

# Columns 15-28 are named after the value in their first data row
# (lower-cased, spaces replaced by underscores).
for idx in range(15, 29):
    new_names[raw_cols[idx]] = df[raw_cols[idx]][0].lower().replace(' ', '_')

new_names[raw_cols[29]] = "shot_first"
new_names[raw_cols[30]] = "ex_uni"
new_names[raw_cols[31]] = "fan_ex_uni"
new_names[raw_cols[32]] = "fan_star_trek"

# Columns 33-36 keep their own header text, normalised the same way.
for idx in range(33, 37):
    new_names[raw_cols[idx]] = raw_cols[idx].lower().replace(' ', '_')

new_names[raw_cols[37]] = "location"

df_clean = df.rename(new_names)

# Drop the first row: it was only used above as a source of column names.
df_clean = df_clean[1:]
Show the code
# Include and execute your code here
# Feature engineering: convert the survey answers into model-ready columns.
# `replace_strict(..., default=0)` is used for every recoding so that any
# answer not listed in a mapping (including a missing answer) becomes 0.

# Age bracket -> ordinal code.
df_clean = df_clean.with_columns(
  pl.col("age").replace_strict({
    "> 60": 4,
    "45-60": 3,
    "30-44": 2,
    "18-29": 1,
  }, default=0).alias("age_num")
)

# Character favourability answers (columns 15-28) -> ordinal code.
character_cols = df_clean.columns[15:29]
df_clean = df_clean.with_columns(
  pl.col(name).replace_strict({
    "Very favorably": 6,
    "Somewhat favorably": 5,
    "Neither favorably nor unfavorably (neutral)": 4,
    "Somewhat unfavorably": 3,
    "Very unfavorably": 2,
    "Unfamiliar (N/A)": 1
  }, default=0).alias(name) for name in character_cols
)

# Per-episode "seen" columns (columns 3-8): null means "no", anything else "yes".
seen_epi_cols = df_clean.columns[3:9]
df_clean = df_clean.with_columns(
  pl.when(pl.col(name).is_null())
  .then(pl.lit("no"))
  .otherwise(pl.lit("yes"))
  .alias(name) for name in seen_epi_cols
)

# Missing episode ranks are filled with 0.
rank_cols = ['rank_epi_i', 'rank_epi_ii', 'rank_epi_iii', 'rank_epi_iv', 'rank_epi_v', 'rank_epi_vi']
df_clean = df_clean.with_columns(
  pl.col(name).fill_null(0) for name in rank_cols
)

# Household income bracket -> ordinal code. Codes 4, 6 and 8 are the
# brackets that start at $50,000 or above.
df_clean = df_clean.with_columns(
  pl.col('household_income').replace_strict({
    "$150,000+": 8,
    "$100,000 - $149,999": 6,
    "$50,000 - $99,999": 4,
    "$25,000 - $49,999": 2,
    "$0 - $24,999": 1
  }, default=0).alias("income")
)

# The raw text columns are no longer needed once their codes exist.
cat_cols = ['seen', 'fan', 'seen_epi_i', 'seen_epi_ii', 'seen_epi_iii', 'seen_epi_iv', 'seen_epi_v', 'seen_epi_vi', 'shot_first', 'ex_uni', 'fan_ex_uni', 'fan_star_trek', 'gender', 'location', 'education']
df_clean = df_clean.drop(['age', 'household_income'])

# One-hot encode the remaining categorical columns and append the indicator
# columns to the numeric ones.
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
enc_ar = enc.fit_transform(df_clean[cat_cols])
enc_cols = list(enc.get_feature_names_out(cat_cols))
enc_df = pl.DataFrame(enc_ar, schema=enc_cols)
df_tot = pl.concat([df_clean.drop(cat_cols), enc_df], how='horizontal')


# Features exclude the respondent identifier and the target itself.
X = df_tot.drop(['RespondentID', 'income'])
y = df_tot.select('income')

display(df_clean)
shape: (1_186, 38)
RespondentID seen fan seen_epi_i seen_epi_ii seen_epi_iii seen_epi_iv seen_epi_v seen_epi_vi rank_epi_i rank_epi_ii rank_epi_iii rank_epi_iv rank_epi_v rank_epi_vi han_solo luke_skywalker princess_leia_organa anakin_skywalker obi_wan_kenobi emperor_palpatine darth_vader lando_calrissian boba_fett c-3p0 r2_d2 jar_jar_binks padme_amidala yoda shot_first ex_uni fan_ex_uni fan_star_trek gender education location age_num income
i64 str str str str str str str str str str str str str str i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 i64 str str str str str str str i64 i64
3292879998 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" "3" "2" "1" "4" "5" "6" 6 6 6 6 6 6 6 1 1 6 6 6 6 6 "I don't understand this questi… "Yes" "No" "No" "Male" "High school degree" "South Atlantic" 1 0
3292879538 "No" null "no" "no" "no" "no" "no" "no" "0" "0" "0" "0" "0" "0" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 null null null "Yes" "Male" "Bachelor degree" "West South Central" 1 1
3292765271 "Yes" "No" "yes" "yes" "yes" "no" "no" "no" "1" "2" "3" "4" "5" "6" 5 5 5 5 5 1 1 1 1 1 1 1 1 1 "I don't understand this questi… "No" null "No" "Male" "High school degree" "West North Central" 1 1
3292763116 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" "5" "6" "1" "2" "4" "3" 6 6 6 6 6 5 6 5 3 6 6 6 6 6 "I don't understand this questi… "No" null "Yes" "Male" "Some college or Associate degr… "West North Central" 1 6
3292731220 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" "5" "4" "6" "2" "1" "3" 6 5 5 3 6 2 5 4 6 5 5 2 5 5 "Greedo" "Yes" "No" "No" "Male" "Some college or Associate degr… "West North Central" 1 6
3288388730 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" "5" "4" "6" "3" "2" "1" 6 5 5 5 6 5 5 5 5 6 6 5 5 6 "Han" "No" null "Yes" "Female" "Some college or Associate degr… "East North Central" 1 1
3288378779 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" "4" "5" "6" "2" "3" "1" 6 5 6 3 6 4 2 5 1 5 6 3 3 6 "I don't understand this questi… "No" null "Yes" "Female" "Bachelor degree" "Mountain" 2 4
3288375286 "No" null "no" "no" "no" "no" "no" "no" "0" "0" "0" "0" "0" "0" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 null null null "No" "Female" "Bachelor degree" "Middle Atlantic" 2 4
3288373068 "Yes" "Yes" "yes" "yes" "yes" "yes" "yes" "yes" "4" "3" "6" "5" "2" "1" 6 4 6 6 6 4 6 5 6 5 5 6 5 6 "Han" "No" null "Yes" "Female" "Some college or Associate degr… "East North Central" 3 6
3288372923 "Yes" "No" "yes" "yes" "no" "no" "yes" "yes" "6" "1" "2" "3" "4" "5" 6 6 6 2 6 2 6 2 1 5 5 2 4 2 "I don't understand this questi… "No" null "No" "Female" "Graduate degree" "Pacific" 4 4
Show the code
# Shared lookup lists for the chart helper functions: the per-episode
# one-hot "seen" columns, the per-episode rank columns, and the film titles,
# all listed in the same episode order.
_EPISODE_NUMERALS = ("i", "ii", "iii", "iv", "v", "vi")

seen_cols = [f"seen_epi_{numeral}_yes" for numeral in _EPISODE_NUMERALS]
rank_cols = [f"rank_epi_{numeral}" for numeral in _EPISODE_NUMERALS]
movies = [
    "The Phantom Menace",
    "Attack of the Clones",
    "Revenge of the Sith",
    "A New Hope",
    "The Empire Strikes Back",
    "Return of the Jedi",
]

def GetSeen(data):
  """Return, per film, the share of respondents who have seen it.

  Only respondents with at least one column of ``seen_cols`` equal to 1 are
  kept, and they form the denominator of every share.

  Args:
    data: polars DataFrame containing the indicator columns in ``seen_cols``.

  Returns:
    polars DataFrame with columns ``movie``, ``percentage`` (a fraction) and
    ``perc_label`` (the rounded percentage as text with a trailing "%").
    Rows are in the reverse order of ``movies``.
  """
  data_seen = data.filter(
    pl.any_horizontal([pl.col(col) == 1 for col in seen_cols])
  ).select(seen_cols)

  total = data_seen.height

  # Count the 1s directly. Picking element [1] of a sorted value_counts()
  # only works when a column holds exactly two distinct values; it raises
  # as soon as a film was seen by everyone who passed the filter.
  seen_counts = [int((data_seen[col] == 1).sum()) for col in seen_cols]

  # Guard the division so an empty selection yields 0% instead of raising.
  seen_percs = [count / total if total else 0.0 for count in seen_counts]

  df_percs = pl.DataFrame({"movie": movies[::-1], "percentage": seen_percs[::-1]})
  df_percs = df_percs.with_columns(
    ((pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%').alias('perc_label')
  )

  return df_percs

def GetRanks(data):
  """Return, per film, the share of respondents who ranked it first.

  Only respondents for whom every column in ``seen_cols`` is greater than 0
  are counted.

  Args:
    data: polars DataFrame containing the columns in ``seen_cols`` and
      ``rank_cols``.

  Returns:
    polars DataFrame with columns ``movie``, ``percentage`` (a fraction) and
    ``perc_label`` (the rounded percentage as text with a trailing "%").
    Rows are in the reverse order of ``movies``.
  """
  seen_all_six = pl.all_horizontal([pl.col(flag) > 0 for flag in seen_cols])
  ranks = data.filter(seen_all_six).select(rank_cols)

  # Episode 3 has 1 missing rank (compared with the other films, the only
  # rank option left over for it is 6).
  ranks = ranks.with_columns(pl.col('rank_epi_iii').replace("0", "6"))

  n_respondents = ranks.height
  first_place_shares = [
    ranks.filter(pl.col(col) == "1").height / n_respondents for col in rank_cols
  ]

  percent_label = (pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%'
  shares_by_movie = pl.DataFrame({
    "movie": list(reversed(movies)),
    "percentage": list(reversed(first_place_shares)),
  })

  return shares_by_movie.with_columns(percent_label.alias('perc_label'))

def GetRatings(data):
  """Return how often each film was ranked in the top, middle or bottom third.

  Only respondents for whom every column in ``seen_cols`` is greater than 0
  are counted. Ranks "1"/"2" form the top third, "3"/"4" the middle third
  and "5"/"6" the bottom third.

  Args:
    data: polars DataFrame containing the columns in ``seen_cols`` and
      ``rank_cols``.

  Returns:
    Long-format polars DataFrame with one row per (rating, movie) pair and
    columns ``movie``, ``rating`` ('Top third', 'Middle third' or
    'Bottom third'), ``percentage`` (a fraction) and ``perc_label`` (the
    rounded percentage as text with a trailing "%"). Rows are ordered by
    rating (top, middle, bottom) and then by the order of ``movies``.
  """
  data_ratings = data.filter(
    pl.all_horizontal([pl.col(col) > 0 for col in seen_cols])
  ).select(rank_cols)

  # Episode 3 has one missing rank, stored as "0"; the only rank left over
  # for it is 6.
  data_ratings = data_ratings.with_columns(pl.col('rank_epi_iii').replace("0", "6"))

  # Collapse the six ranks into three buckets; anything unrecognised maps to
  # 0 and is therefore counted in the denominator only.
  thirds = data_ratings.with_columns(
    pl.col(col).replace_strict({
        "1": 1,
        "2": 1,
        "3": 2,
        "4": 2,
        "5": 3,
        "6": 3
    }, default=0).alias(col) for col in rank_cols
  )

  third_labels = {1: 'Top third', 2: 'Middle third', 3: 'Bottom third'}
  total = thirds.height

  # Count each (bucket, film) pair directly. This keeps the row order fixed
  # (an unordered group_by followed by a pivot does not), reports a
  # combination nobody chose as 0 rather than null, and does not depend on
  # the positions of pivoted columns.
  records = [
    {
      "movie": movie,
      "rating": label,
      "percentage": int((thirds[col] == code).sum()) / total if total else 0.0,
    }
    for code, label in third_labels.items()
    for col, movie in zip(rank_cols, movies)
  ]

  third_long = pl.DataFrame(
    records,
    schema={"movie": pl.String, "rating": pl.String, "percentage": pl.Float64},
  )

  third_long = third_long.with_columns(
    ((pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%').alias('perc_label')
  )

  return third_long

QUESTION 1

  1. Build a machine learning model that predicts whether a person makes at least $50k with accuracy of at least 65%. Describe your model and report the accuracy.

Describe your model and report the accuracy.

Show the code
# Include and execute your code here
rnd = 343

# The question asks whether a person makes at least $50k, so the ordinal
# `income` code is collapsed into a binary target: codes 4, 6 and 8 are the
# brackets starting at $50,000 or above. Code 0 marks respondents without a
# recognised income bracket; they are dropped rather than being labelled as
# "under $50k".
# NOTE: any classification report rendered from an earlier multi-class run
# must be regenerated after this change.
income_codes = y["income"]
has_income = income_codes > 0
X_known = X.filter(has_income)
y_known = (income_codes.filter(has_income) >= 4).cast(pl.Int8).to_numpy()

X_trn, X_tst, y_trn, y_tst = train_test_split(X_known, y_known, random_state=rnd, test_size=0.2)

# Seed the classifier as well as the split so the reported scores are
# reproducible from run to run.
model = GradientBoostingClassifier(random_state=rnd)
model.fit(X_trn, y_trn)
pred = model.predict(X_tst)
print(classification_report(y_tst, pred))
              precision    recall  f1-score   support

           0       0.57      0.56      0.57        62
           1       0.43      0.39      0.41        23
           2       0.38      0.35      0.36        40
           4       0.38      0.43      0.41        76
           6       0.15      0.13      0.14        23
           8       0.15      0.14      0.15        14

    accuracy                           0.40       238
   macro avg       0.34      0.34      0.34       238
weighted avg       0.40      0.40      0.40       238

QUESTION 2

  1. Validate that the data provided on GitHub lines up with the article by recreating a 3rd visual from the article.

Describe the images.

Show the code
# Five charts:
# Which 'Star Wars' Movies Have You Seen?
# Horizontal bar chart of the share of respondents who have seen each film,
# with the rounded percentage printed next to every bar.
df_seen = GetSeen(df_tot)
movie_graph = (
    ggplot(data=df_seen)
    + geom_bar(mapping=aes(x='percentage', y='movie'), stat='identity', orientation='y', color="lightblue", fill="lightblue")
    + labs(
      # Plural "Movies": the previous title carried a stray apostrophe.
      title="Which 'Star Wars' Movies Have You Seen?",
      subtitle="Of 835 respondents who have seen any film",
      x='',
      y=''
    )
    + scale_x_continuous(limits=[0,1])
    # nudge_x moves each label away from the bar's end value.
    + geom_text(aes(x='percentage', y='movie', label='perc_label'), nudge_x=0.075, size=12, color='black')
    + theme(
        panel_background=element_rect(fill='gray', linetype=0),
        plot_background=element_rect(fill='gray'),
        panel_grid=element_blank(),
        legend_background=element_rect(fill='gray'),
        axis_text=element_text(color='black', size=18),
        plot_title=element_text(color='black', face="bold", hjust=0, size=25),
        plot_subtitle=element_text(color='black', hjust=0, size=20),
        legend_text=element_text(color='white'),
        legend_title=element_text(color='white'),
        label_text=element_text(color='white'),
        # Hide the x axis entirely; the bar labels carry the values.
        axis_line_x=element_blank(),
        axis_ticks_x=element_blank(),
        axis_text_x=element_blank(),
        plot_title_position='plot'
    )
    + ggsize(1600, 900)
)

display(movie_graph)
# What's the Best 'Star Wars' Movie?
# Horizontal bar chart of the share of first-place rankings per film, with
# the rounded percentage printed next to every bar.
df_ranks = GetRanks(df_tot)

rank_bars = geom_bar(
    mapping=aes(x='percentage', y='movie'),
    stat='identity',
    orientation='y',
    color="lightblue",
    fill="lightblue",
)
rank_titles = labs(
    title="What's the Best 'Star Wars' Movie?",
    subtitle="Of 471 respondents who have seen all 6 films",
    x='',
    y='',
)
rank_value_labels = geom_text(
    aes(x='percentage', y='movie', label='perc_label'),
    nudge_x=0.035,
    size=12,
    color='black',
)
# Gray backgrounds, no grid, and no x axis (the bar labels carry the values).
rank_theme = theme(
    panel_background=element_rect(fill='gray', linetype=0),
    plot_background=element_rect(fill='gray'),
    legend_background=element_rect(fill='gray'),
    panel_grid=element_blank(),
    plot_title=element_text(color='black', face="bold", hjust=0, size=25),
    plot_subtitle=element_text(color='black', hjust=0, size=20),
    plot_title_position='plot',
    axis_text=element_text(color='black', size=18),
    axis_line_x=element_blank(),
    axis_ticks_x=element_blank(),
    axis_text_x=element_blank(),
    legend_text=element_text(color='white'),
    legend_title=element_text(color='white'),
    label_text=element_text(color='white'),
)

rank_graph = (
    ggplot(data=df_ranks)
    + rank_bars
    + rank_titles
    + scale_x_continuous(limits=[0,0.4])
    + rank_value_labels
    + rank_theme
    + ggsize(1600, 900)
)

display(rank_graph)
# How People Rate the 'Star Wars' Movies
# One facet per rating bucket; each facet is a horizontal bar chart of the
# share of respondents who placed each film in that bucket.
df_ratings = GetRatings(df_tot)

rate_components = [
    geom_bar(
        mapping=aes(x='percentage', y='movie', color='rating', fill='rating'),
        stat='identity',
        orientation='y',
        position='dodgev',
    ),
    facet_wrap(facets='rating', ncol=3),
    # The facet titles already name the rating, so both legends are turned off.
    guides(color='none', fill='none'),
    labs(
        title="How People Rate the 'Star Wars' Movies",
        subtitle="How often each film was rated in the top, middle, and bottom third (by 471 respondents who have seen all six films)",
        x='',
        y='',
    ),
    geom_text(
        aes(x='percentage', y='movie', label='perc_label'),
        nudge_x=0.15,
        size=12,
        color='black',
    ),
    theme(
        panel_background=element_rect(fill='gray', linetype=0),
        plot_background=element_rect(fill='gray'),
        legend_background=element_rect(fill='gray'),
        panel_grid=element_blank(),
        plot_title=element_text(color='black', face="bold", hjust=0, size=25),
        plot_subtitle=element_text(color='black', hjust=0, size=20),
        plot_title_position='plot',
        axis_text=element_text(color='black', size=18),
        axis_line_x=element_blank(),
        axis_ticks_x=element_blank(),
        axis_text_x=element_blank(),
        legend_text=element_text(color='white'),
        legend_title=element_text(color='white'),
        label_text=element_text(color='white'),
    ),
    ggsize(1800, 900),
]

# Add the components to the base plot in the order listed above.
rate_graph = ggplot(data=df_ratings)
for component in rate_components:
    rate_graph = rate_graph + component

display(rate_graph)
# 'Star Wars' Character Favorability Ratings
# Who Shot First?

Question 3

  1. Create a new column that converts the location groupings to a single number (a.k.a. label encoding). Drop the location categorical column.

Describe the changes.